Word Associations, Reported Content

It turns out that words like the n-word and TERF appear very frequently in the Holyoke and Smith confessionals, respectively. We want to dig deeper into how these and other controversial words are being used.

  • What words appear in the same secret as the words above?
  • What kinds of secrets are reported, and how might they overlap with the above words?
  • What word associations can be found in the corpus?
In [42]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import functools

from os import path
from scipy.ndimage import imread
from nltk.util import ngrams
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from IPython.display import display

import cufflinks as cf
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode()

plt.style.use('ggplot')
%matplotlib inline
In [16]:
# Load the cleaned comment / report / secret tables of each confessional.
# The files are read in the order comments, reports, secrets.
holyc_df, holyr_df, holys_df = map(pd.read_csv, (
    '../tmp/clean/holyokecon_confessional_comments.csv',
    '../tmp/clean/holyokecon_confessional_reports.csv',
    '../tmp/clean/holyokecon_confessional_secrets.csv',
))
smithc_df, smithr_df, smiths_df = map(pd.read_csv, (
    '../tmp/clean/smithcon_confessional_comments.csv',
    '../tmp/clean/smithcon_confessional_reports.csv',
    '../tmp/clean/smithcon_confessional_secrets.csv',
))

# Names of the cleaned-token columns for secret text and report text.
SECRET_COL, REPORT_COL = 'clean_tokens_secret', 'clean_tokens_report'
In [14]:
# Left-join each Holyoke secret to its reports. Columns that exist in both
# tables are disambiguated with the '_secret' / '_report' suffixes; secrets
# without any report keep NaN in the report columns.
holysr_df = holys_df.merge(holyr_df, left_on='id', right_on="secret_id",
                           how='left', suffixes=('_secret', '_report'))
# Preprocess: remove rows with a null clean_tokens_secret value so the
# string operations applied to that column later never see NaN.
holysr_df = holysr_df[holysr_df[SECRET_COL].notnull()]
holysr_df.head()
Out[14]:
id_secret comments clean_tokens_secret id_report clean_tokens_report secret_id comment_id
0 14040 25 goddamn insomnia NaN NaN NaN NaN
1 13994 19 sleep keep secret NaN NaN NaN NaN
2 10971 15 accident waiting happen 2120495 wrong thread 10971 2120493
3 12515 31 site ruining life NaN NaN NaN NaN
4 9854 10 kick dont believe 1017 troll 9854 90928
In [192]:
# Detect secrets whose cleaned text matches any of the target terms.
# The pattern is a regular expression; other term groups (racial slurs,
# asian|yellow|latino|white, ...) can be swapped in to repeat the analysis.
pattern = r'gay|lesbian|trans|bisex'
# na=False makes a missing secret text count as "no match" instead of
# propagating NaN into the boolean mask.
selector = holysr_df[SECRET_COL].str.contains(pattern, na=False)
match_df = holysr_df[selector]

# One row per secret: a secret joined to several reports appears once.
match_secrets = match_df.drop_duplicates(subset='id_secret')

# Matching secrets that have no report attached.
match_not_reported = match_secrets[match_secrets['id_report'].isnull()]

# Matching secrets that have at least one report attached.
match_reported = match_secrets[match_secrets['id_report'].notnull()]

# Every matching secret/report row that carries report text. This is taken
# from match_df (not deduplicated) so each individual report is kept.
report_text = match_df[match_df[REPORT_COL].notnull()]
In [194]:
# Keyword arguments forwarded to WordCloud for every cloud in this notebook.
# random_state is fixed so repeated runs use the same seed.
word_cloud_options = dict(
    width=800,
    height=800,
    background_color="white",
    max_words=500,
    stopwords=STOPWORDS,
    random_state=42,
)

def create_word_cloud(text_iterable, image_color_fp=None,
                      title='', **kwargs):
    """Render a word cloud of the given texts with matplotlib.

    Parameters
    ----------
    text_iterable : iterable of str
        Texts to visualise; they are joined with single spaces into one
        document before the cloud is generated.
    image_color_fp : str or None
        Path of an image used both as the cloud's mask and as the source of
        the word colours. When None, the cloud is drawn without a mask and
        keeps WordCloud's own colouring.
    title : str
        Title placed above the figure.
    **kwargs
        Forwarded to ``WordCloud``. A ``mask`` entry is overwritten whenever
        ``image_color_fp`` is given.
    """
    coloring = None
    if image_color_fp is not None:
        # NOTE(review): scipy.ndimage.imread is no longer shipped by recent
        # SciPy releases -- confirm the SciPy version this notebook targets.
        coloring = imread(image_color_fp)
        kwargs['mask'] = coloring

    wc = WordCloud(**kwargs)
    wc.generate(" ".join(text_iterable))

    if coloring is not None:
        # Recolour each word from the pixels of the mask image.
        image = wc.recolor(color_func=ImageColorGenerator(coloring))
    else:
        image = wc

    plt.figure(figsize=(8, 8))
    plt.title(title)
    plt.imshow(image)
    plt.axis("off")
    plt.show()
    
logo_fp = '../assets/logo2.png'
# Word cloud of the deduplicated matching secrets; the image at logo_fp is
# handed to create_word_cloud as its image_color_fp argument.
create_word_cloud(
    text_iterable=match_secrets[SECRET_COL].astype(str),
    image_color_fp=logo_fp,
    title="Holyoke Secrets Containing the word %s" % pattern,
    **word_cloud_options
)
In [195]:
# Defining functions to compute word frequency
def word_counter(text, n=1, length_thres=50):
    """Count the 1-grams up to ``n``-grams of a whitespace-separated string.

    Parameters
    ----------
    text : str
        Document to tokenise with ``str.split()``.
    n : int
        Largest n-gram order to count. Values below 1 are treated as 1, so
        unigrams are always counted.
    length_thres : int
        Tokens with ``len(token) >= length_thres`` are dropped before any
        n-gram is built.

    Returns
    -------
    collections.Counter
        Maps every n-gram (its tokens joined by one space) to the number of
        times it occurs in ``text``.
    """
    tokens = [tk for tk in text.split() if len(tk) < length_thres]
    counts = Counter()
    for size in range(1, max(n, 1) + 1):
        # Slide a window of `size` tokens over the *original* token list.
        # Every order is built from that same untouched list, so each n-gram
        # is counted exactly once and lower-order n-grams are never fed back
        # in as if they were tokens.
        windows = zip(*[tokens[start:] for start in range(size)])
        counts.update(" ".join(window) for window in windows)
    return counts

def word_aggregater(corpus_list, n=1):
    """Sum the n-gram counts of every document in ``corpus_list``.

    Each document is passed to ``word_counter`` with the given ``n`` and the
    per-document counts are accumulated into a single Counter, which is
    returned.
    """
    def _fold(running_total, document):
        # Add this document's counts to the shared accumulator.
        running_total.update(word_counter(document, n=n))
        return running_total

    return functools.reduce(_fold, corpus_list, Counter())

def count_token_frequency(token_series, filter_thres, **kwargs):
    """Build a frequency table of the n-grams found in ``token_series``.

    Parameters
    ----------
    token_series : iterable of str
        Documents to count; handed to ``word_aggregater``.
    filter_thres : number
        Only n-grams with ``frequency > filter_thres`` are kept.
    **kwargs
        Forwarded to ``word_aggregater`` (e.g. ``n``).

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``frequency`` and ``ngrams`` (number of tokens in
        ``word``), sorted by descending frequency with a fresh 0..k-1 index.
        An empty input yields an empty frame with the same columns.
    """
    counts = word_aggregater(token_series, **kwargs)
    # Materialise the items and name the columns up front: this keeps the
    # 'word'/'frequency' columns present even when nothing was counted.
    freq_df = pd.DataFrame(list(counts.items()), columns=['word', 'frequency'])
    freq_df = freq_df[freq_df['frequency'] > filter_thres] \
        .sort_values('frequency', ascending=False)
    freq_df = freq_df.assign(
        ngrams=freq_df['word'].apply(lambda x: len(x.split())))
    return freq_df.reset_index(drop=True)

# Create frequency count dataframes: 1- and 2-grams whose frequency is
# greater than 5.
secrets_corpus = count_token_frequency(match_secrets[SECRET_COL], 5, n=2)
secrets_not_reported_corpus = count_token_frequency(match_not_reported[SECRET_COL], 5, n=2)
secrets_reported_corpus = count_token_frequency(match_reported[SECRET_COL], 5, n=2)
# The report corpus is counted from the report text column (REPORT_COL),
# not from the text of the secret that was reported.
report_text_corpus = count_token_frequency(report_text[REPORT_COL], 5, n=2)
In [237]:
# Keep only the bigram rows (ngrams == 2) of each secrets frequency table.
secrets_corpus = secrets_corpus[secrets_corpus['ngrams'] == 2]
secrets_not_reported_corpus = secrets_not_reported_corpus[secrets_not_reported_corpus['ngrams'] == 2]
secrets_reported_corpus = secrets_reported_corpus[secrets_reported_corpus['ngrams'] == 2]

# Restrict the reported / not-reported tables to a shared vocabulary:
# the first 30 bigrams of secrets_corpus.
vocabulary = secrets_corpus['word'].head(30).tolist()


def vocab_filter(x):
    """Return True when x is one of the vocabulary entries."""
    return x in vocabulary


in_vocab_not_reported = secrets_not_reported_corpus['word'].apply(vocab_filter)
secrets_nr_filtered = secrets_not_reported_corpus[in_vocab_not_reported]

in_vocab_reported = secrets_reported_corpus['word'].apply(vocab_filter)
secrets_r_filtered = secrets_reported_corpus[in_vocab_reported]
In [238]:
def create_bar_trace(dataframe, graph_obj, **go_kwargs):
    """Build a trace with frequencies on the x axis and words on the y axis.

    ``dataframe`` must support ``['frequency']`` and ``['word']`` lookups.
    ``graph_obj`` is the callable that constructs the trace (e.g. ``go.Bar``);
    it receives the two columns as ``x`` and ``y`` plus every extra keyword
    argument given in ``go_kwargs``. Its return value is returned unchanged.
    """
    frequencies = dataframe['frequency']
    words = dataframe['word']
    return graph_obj(x=frequencies, y=words, **go_kwargs)

# Horizontal bars: one trace for bigrams in unreported secrets, one for
# bigrams in reported secrets. They are stacked per bigram by the layout.
trace1 = create_bar_trace(secrets_nr_filtered, go.Bar,
                          name='Not Reported', orientation='h',
                          marker={'color': '#bc94d3'})
trace2 = create_bar_trace(secrets_r_filtered, go.Bar,
                          name='Reported', orientation='h',
                          marker={'color': '#8551a3'})

data = [trace1, trace2]
layout = go.Layout(
    # Wide left margin leaves room for the bigram labels on the y axis.
    margin={
        'l': 125
    },
    barmode='stack',
    width=700,
    height=700,
    # Legend sits above the plotting area, aligned to its left edge.
    legend={
        'yanchor': 'top',
        'traceorder': "normal",
        'xanchor': "left",
        'borderwidth': 0,
        'y': 1.20,
        'x': 0,
        # Only the size is set; colour and family are left to plotly's
        # defaults rather than being passed as empty strings.
        'font': {
            'size': 18
        },
    },
    # Reversed so the first row of the table is drawn at the top.
    yaxis={
        'autorange': 'reversed'
    },
    # Frequency axis is drawn at the top and mirrored at the bottom.
    xaxis={
        'mirror': True,
        'side': "top"
    }
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Drawing...
In [7]:
# Ask for a search term on stdin and echo it back.
search = raw_input("What do you want to search for?")
print(search)
What do you want to search for?blah
blah